{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 第6章 数据加载、存储与文件格式\n",
    "# https://www.jianshu.com/p/047d8c1c7e14\n",
    "\n",
    "# 输入输出通常可以划分为几个大类：\n",
    "# 读取文本文件和其他更高效的磁盘存储格式，\n",
    "# 加载数据库中的数据，\n",
    "# 利用Web API操作网络资源。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 6.1 读写文本格式的数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# pandas提供了一些用于将表格型数据读取为DataFrame对象的函数。\n",
    "# 表6-1对它们进行了总结，其中read_csv和read_table可能会是你今后用得最多的。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "'cat' 不是内部或外部命令，也不是可运行的程序\n",
      "或批处理文件。\n"
     ]
    }
   ],
   "source": [
    "!cat ex1.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " 驱动器 F 中的卷是 新加卷\n",
      " 卷的序列号是 A4FC-F3D6\n",
      "\n",
      " F:\\wxg103\\pythonProjects\\forDataAnalysis2\\ch06 的目录\n",
      "\n",
      "2019/01/29  17:45    <DIR>          .\n",
      "2019/01/29  17:45    <DIR>          ..\n",
      "2019/01/29  17:37    <DIR>          .ipynb_checkpoints\n",
      "2019/01/29  17:41             1,474 chapter06.ipynb\n",
      "2019/01/29  17:44                58 ex1.csv\n",
      "               2 个文件          1,532 字节\n",
      "               3 个目录 106,214,522,880 可用字节\n"
     ]
    }
   ],
   "source": [
    "!dir"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "a,b,c,d,message\n",
      "1,2,3,4,hello\n",
      "5,6,7,8,world\n",
      "9,10,11,12,foo\n"
     ]
    }
   ],
   "source": [
    "!type ex1.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>a</th>\n",
       "      <th>b</th>\n",
       "      <th>c</th>\n",
       "      <th>d</th>\n",
       "      <th>message</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "      <td>hello</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>5</td>\n",
       "      <td>6</td>\n",
       "      <td>7</td>\n",
       "      <td>8</td>\n",
       "      <td>world</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>9</td>\n",
       "      <td>10</td>\n",
       "      <td>11</td>\n",
       "      <td>12</td>\n",
       "      <td>foo</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   a   b   c   d message\n",
       "0  1   2   3   4   hello\n",
       "1  5   6   7   8   world\n",
       "2  9  10  11  12     foo"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 由于该文件以逗号分隔，所以我们可以使用read_csv将其读入一个DataFrame：\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "df = pd.read_csv('ex1.csv')\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>a</th>\n",
       "      <th>b</th>\n",
       "      <th>c</th>\n",
       "      <th>d</th>\n",
       "      <th>message</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "      <td>hello</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>5</td>\n",
       "      <td>6</td>\n",
       "      <td>7</td>\n",
       "      <td>8</td>\n",
       "      <td>world</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>9</td>\n",
       "      <td>10</td>\n",
       "      <td>11</td>\n",
       "      <td>12</td>\n",
       "      <td>foo</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   a   b   c   d message\n",
       "0  1   2   3   4   hello\n",
       "1  5   6   7   8   world\n",
       "2  9  10  11  12     foo"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 我们还可以使用read_table，并指定分隔符：\n",
    "pd.read_table('ex1.csv', sep=',')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1,2,3,4,hello\n",
      "5,6,7,8,world\n",
      "9,10,11,12,foo\n"
     ]
    }
   ],
   "source": [
    "# 并不是所有文件都有标题行。看看下面这个文件：\n",
    "!type ex2.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "      <td>hello</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>5</td>\n",
       "      <td>6</td>\n",
       "      <td>7</td>\n",
       "      <td>8</td>\n",
       "      <td>world</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>9</td>\n",
       "      <td>10</td>\n",
       "      <td>11</td>\n",
       "      <td>12</td>\n",
       "      <td>foo</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   0   1   2   3      4\n",
       "0  1   2   3   4  hello\n",
       "1  5   6   7   8  world\n",
       "2  9  10  11  12    foo"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 读入该文件的办法有两个。\n",
    "# 你可以让pandas为其分配默认的列名，也可以自己定义列名：\n",
    "pd.read_csv('ex2.csv', header=None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>a</th>\n",
       "      <th>b</th>\n",
       "      <th>c</th>\n",
       "      <th>d</th>\n",
       "      <th>message</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "      <td>hello</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>5</td>\n",
       "      <td>6</td>\n",
       "      <td>7</td>\n",
       "      <td>8</td>\n",
       "      <td>world</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>9</td>\n",
       "      <td>10</td>\n",
       "      <td>11</td>\n",
       "      <td>12</td>\n",
       "      <td>foo</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   a   b   c   d message\n",
       "0  1   2   3   4   hello\n",
       "1  5   6   7   8   world\n",
       "2  9  10  11  12     foo"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.read_csv('ex2.csv', names=['a', 'b', 'c', 'd', 'message'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>a</th>\n",
       "      <th>b</th>\n",
       "      <th>c</th>\n",
       "      <th>d</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>message</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>hello</th>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>world</th>\n",
       "      <td>5</td>\n",
       "      <td>6</td>\n",
       "      <td>7</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>foo</th>\n",
       "      <td>9</td>\n",
       "      <td>10</td>\n",
       "      <td>11</td>\n",
       "      <td>12</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         a   b   c   d\n",
       "message               \n",
       "hello    1   2   3   4\n",
       "world    5   6   7   8\n",
       "foo      9  10  11  12"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 假设你希望将message列做成DataFrame的索引。\n",
    "# 你可以明确表示要将该列放到索引4的位置上，也可以通过index_col参数指定\"message\"：\n",
    "\n",
    "names = ['a', 'b', 'c', 'd', 'message']\n",
    "pd.read_csv('ex2.csv', names=names, index_col='message')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "key1,key2,value1,value2\n",
      "one,a,1,2\n",
      "one,b,3,4\n",
      "one,c,5,6\n",
      "one,d,7,8\n",
      "two,a,9,10\n",
      "two,b,11,12\n",
      "two,c,13,14\n",
      "two,d,15,16\n"
     ]
    }
   ],
   "source": [
    "# 如果希望将多个列做成一个层次化索引，只需传入由列编号或列名组成的列表即可：\n",
    "!type csv_mindex.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>value1</th>\n",
       "      <th>value2</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>key1</th>\n",
       "      <th>key2</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th rowspan=\"4\" valign=\"top\">one</th>\n",
       "      <th>a</th>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>b</th>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>c</th>\n",
       "      <td>5</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>d</th>\n",
       "      <td>7</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"4\" valign=\"top\">two</th>\n",
       "      <th>a</th>\n",
       "      <td>9</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>b</th>\n",
       "      <td>11</td>\n",
       "      <td>12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>c</th>\n",
       "      <td>13</td>\n",
       "      <td>14</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>d</th>\n",
       "      <td>15</td>\n",
       "      <td>16</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           value1  value2\n",
       "key1 key2                \n",
       "one  a          1       2\n",
       "     b          3       4\n",
       "     c          5       6\n",
       "     d          7       8\n",
       "two  a          9      10\n",
       "     b         11      12\n",
       "     c         13      14\n",
       "     d         15      16"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "parsed = pd.read_csv('csv_mindex.csv',\n",
    "                     index_col=['key1', 'key2'])\n",
    "parsed"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['            A         B         C\\n',\n",
       " 'aaa -0.264438 -1.026059 -0.619500\\n',\n",
       " 'bbb  0.927272  0.302904 -0.032399\\n',\n",
       " 'ccc -0.264273 -0.386314 -0.217601\\n',\n",
       " 'ddd -0.871858 -0.348382  1.100491\\n']"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 有些情况下，有些表格可能不是用固定的分隔符去分隔字段的（比如空白符或其它模式）。\n",
    "# 看看下面这个文本文件：\n",
    "\n",
    "list(open('ex3.txt'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>A</th>\n",
       "      <th>B</th>\n",
       "      <th>C</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>aaa</th>\n",
       "      <td>-0.264438</td>\n",
       "      <td>-1.026059</td>\n",
       "      <td>-0.619500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>bbb</th>\n",
       "      <td>0.927272</td>\n",
       "      <td>0.302904</td>\n",
       "      <td>-0.032399</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ccc</th>\n",
       "      <td>-0.264273</td>\n",
       "      <td>-0.386314</td>\n",
       "      <td>-0.217601</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ddd</th>\n",
       "      <td>-0.871858</td>\n",
       "      <td>-0.348382</td>\n",
       "      <td>1.100491</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            A         B         C\n",
       "aaa -0.264438 -1.026059 -0.619500\n",
       "bbb  0.927272  0.302904 -0.032399\n",
       "ccc -0.264273 -0.386314 -0.217601\n",
       "ddd -0.871858 -0.348382  1.100491"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 虽然可以手动对数据进行规整，这里的字段是被数量不同的空白字符间隔开的。\n",
    "# 这种情况下，你可以传递一个正则表达式作为read_table的分隔符。\n",
    "# 可以用正则表达式表达为\\s+，于是有：\n",
    "\n",
    "result = pd.read_table('ex3.txt', sep='\\s+')\n",
    "\n",
    "# 这里，由于列名比数据行的数量少，所以read_table推断第一列应该是DataFrame的索引。\n",
    "result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "# hey!\n",
      "a,b,c,d,message\n",
      "# just wanted to make things more difficult for you\n",
      "# who reads CSV files with computers, anyway?\n",
      "1,2,3,4,hello\n",
      "5,6,7,8,world\n",
      "9,10,11,12,foo\n"
     ]
    }
   ],
   "source": [
    "# 这些解析器函数还有许多参数可以帮助你处理各种各样的异形文件格式（表6-2列出了一些）。\n",
    "# 比如说，你可以用skiprows跳过文件的第一行、第三行和第四行：\n",
    "\n",
    "!type ex4.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>a</th>\n",
       "      <th>b</th>\n",
       "      <th>c</th>\n",
       "      <th>d</th>\n",
       "      <th>message</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "      <td>hello</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>5</td>\n",
       "      <td>6</td>\n",
       "      <td>7</td>\n",
       "      <td>8</td>\n",
       "      <td>world</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>9</td>\n",
       "      <td>10</td>\n",
       "      <td>11</td>\n",
       "      <td>12</td>\n",
       "      <td>foo</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   a   b   c   d message\n",
       "0  1   2   3   4   hello\n",
       "1  5   6   7   8   world\n",
       "2  9  10  11  12     foo"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.read_csv('ex4.csv', skiprows=[0, 2, 3])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "something,a,b,c,d,message\n",
      "one,1,2,3,4,NA\n",
      "two,5,6,,8,world\n",
      "three,9,10,11,12,foo\n"
     ]
    }
   ],
   "source": [
    "# 缺失值处理是文件解析任务中的一个重要组成部分。\n",
    "# 缺失数据经常是要么没有（空字符串），要么用某个标记值表示。\n",
    "# 默认情况下，pandas会用一组经常出现的标记值进行识别，比如NA及NULL：\n",
    "\n",
    "!type ex5.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>something</th>\n",
       "      <th>a</th>\n",
       "      <th>b</th>\n",
       "      <th>c</th>\n",
       "      <th>d</th>\n",
       "      <th>message</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>one</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>3.0</td>\n",
       "      <td>4</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>two</td>\n",
       "      <td>5</td>\n",
       "      <td>6</td>\n",
       "      <td>NaN</td>\n",
       "      <td>8</td>\n",
       "      <td>world</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>three</td>\n",
       "      <td>9</td>\n",
       "      <td>10</td>\n",
       "      <td>11.0</td>\n",
       "      <td>12</td>\n",
       "      <td>foo</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  something  a   b     c   d message\n",
       "0       one  1   2   3.0   4     NaN\n",
       "1       two  5   6   NaN   8   world\n",
       "2     three  9  10  11.0  12     foo"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "result = pd.read_csv('ex5.csv')\n",
    "result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>something</th>\n",
       "      <th>a</th>\n",
       "      <th>b</th>\n",
       "      <th>c</th>\n",
       "      <th>d</th>\n",
       "      <th>message</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   something      a      b      c      d  message\n",
       "0      False  False  False  False  False     True\n",
       "1      False  False  False   True  False    False\n",
       "2      False  False  False  False  False    False"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.isnull(result)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>something</th>\n",
       "      <th>a</th>\n",
       "      <th>b</th>\n",
       "      <th>c</th>\n",
       "      <th>d</th>\n",
       "      <th>message</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>one</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>3.0</td>\n",
       "      <td>4</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>two</td>\n",
       "      <td>5</td>\n",
       "      <td>6</td>\n",
       "      <td>NaN</td>\n",
       "      <td>8</td>\n",
       "      <td>world</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>three</td>\n",
       "      <td>9</td>\n",
       "      <td>10</td>\n",
       "      <td>11.0</td>\n",
       "      <td>12</td>\n",
       "      <td>foo</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  something  a   b     c   d message\n",
       "0       one  1   2   3.0   4     NaN\n",
       "1       two  5   6   NaN   8   world\n",
       "2     three  9  10  11.0  12     foo"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# na_values可以用一个列表或集合的字符串表示缺失值：\n",
    "\n",
    "result = pd.read_csv('ex5.csv', na_values=['NULL'])\n",
    "result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>something</th>\n",
       "      <th>a</th>\n",
       "      <th>b</th>\n",
       "      <th>c</th>\n",
       "      <th>d</th>\n",
       "      <th>message</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>one</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>3.0</td>\n",
       "      <td>4</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>NaN</td>\n",
       "      <td>5</td>\n",
       "      <td>6</td>\n",
       "      <td>NaN</td>\n",
       "      <td>8</td>\n",
       "      <td>world</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>three</td>\n",
       "      <td>9</td>\n",
       "      <td>10</td>\n",
       "      <td>11.0</td>\n",
       "      <td>12</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  something  a   b     c   d message\n",
       "0       one  1   2   3.0   4     NaN\n",
       "1       NaN  5   6   NaN   8   world\n",
       "2     three  9  10  11.0  12     NaN"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 字典的各列可以使用不同的NA标记值：\n",
    "\n",
    "sentinels = {'message': ['foo', 'NA'], 'something': ['two']}\n",
    "pd.read_csv('ex5.csv', na_values=sentinels)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 逐块读取文本文件"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 在处理很大的文件时，或找出大文件中的参数集以便于后续处理时，\n",
    "# 你可能只想读取文件的一小部分或逐块对文件进行迭代。\n",
    "\n",
    "# 在看大文件之前，我们先设置pandas显示地更紧些：\n",
    "pd.options.display.max_rows = 10"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>one</th>\n",
       "      <th>two</th>\n",
       "      <th>three</th>\n",
       "      <th>four</th>\n",
       "      <th>key</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.467976</td>\n",
       "      <td>-0.038649</td>\n",
       "      <td>-0.295344</td>\n",
       "      <td>-1.824726</td>\n",
       "      <td>L</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>-0.358893</td>\n",
       "      <td>1.404453</td>\n",
       "      <td>0.704965</td>\n",
       "      <td>-0.200638</td>\n",
       "      <td>B</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>-0.501840</td>\n",
       "      <td>0.659254</td>\n",
       "      <td>-0.421691</td>\n",
       "      <td>-0.057688</td>\n",
       "      <td>G</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.204886</td>\n",
       "      <td>1.074134</td>\n",
       "      <td>1.388361</td>\n",
       "      <td>-0.982404</td>\n",
       "      <td>R</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.354628</td>\n",
       "      <td>-0.133116</td>\n",
       "      <td>0.283763</td>\n",
       "      <td>-0.837063</td>\n",
       "      <td>Q</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9995</th>\n",
       "      <td>2.311896</td>\n",
       "      <td>-0.417070</td>\n",
       "      <td>-1.409599</td>\n",
       "      <td>-0.515821</td>\n",
       "      <td>L</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9996</th>\n",
       "      <td>-0.479893</td>\n",
       "      <td>-0.650419</td>\n",
       "      <td>0.745152</td>\n",
       "      <td>-0.646038</td>\n",
       "      <td>E</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9997</th>\n",
       "      <td>0.523331</td>\n",
       "      <td>0.787112</td>\n",
       "      <td>0.486066</td>\n",
       "      <td>1.093156</td>\n",
       "      <td>K</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9998</th>\n",
       "      <td>-0.362559</td>\n",
       "      <td>0.598894</td>\n",
       "      <td>-1.843201</td>\n",
       "      <td>0.887292</td>\n",
       "      <td>G</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9999</th>\n",
       "      <td>-0.096376</td>\n",
       "      <td>-1.012999</td>\n",
       "      <td>-0.657431</td>\n",
       "      <td>-0.573315</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>10000 rows × 5 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "           one       two     three      four key\n",
       "0     0.467976 -0.038649 -0.295344 -1.824726   L\n",
       "1    -0.358893  1.404453  0.704965 -0.200638   B\n",
       "2    -0.501840  0.659254 -0.421691 -0.057688   G\n",
       "3     0.204886  1.074134  1.388361 -0.982404   R\n",
       "4     0.354628 -0.133116  0.283763 -0.837063   Q\n",
       "...        ...       ...       ...       ...  ..\n",
       "9995  2.311896 -0.417070 -1.409599 -0.515821   L\n",
       "9996 -0.479893 -0.650419  0.745152 -0.646038   E\n",
       "9997  0.523331  0.787112  0.486066  1.093156   K\n",
       "9998 -0.362559  0.598894 -1.843201  0.887292   G\n",
       "9999 -0.096376 -1.012999 -0.657431 -0.573315   0\n",
       "\n",
       "[10000 rows x 5 columns]"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 然后有：\n",
    "\n",
    "result = pd.read_csv('ex6.csv')\n",
    "result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>one</th>\n",
       "      <th>two</th>\n",
       "      <th>three</th>\n",
       "      <th>four</th>\n",
       "      <th>key</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.467976</td>\n",
       "      <td>-0.038649</td>\n",
       "      <td>-0.295344</td>\n",
       "      <td>-1.824726</td>\n",
       "      <td>L</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>-0.358893</td>\n",
       "      <td>1.404453</td>\n",
       "      <td>0.704965</td>\n",
       "      <td>-0.200638</td>\n",
       "      <td>B</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>-0.501840</td>\n",
       "      <td>0.659254</td>\n",
       "      <td>-0.421691</td>\n",
       "      <td>-0.057688</td>\n",
       "      <td>G</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.204886</td>\n",
       "      <td>1.074134</td>\n",
       "      <td>1.388361</td>\n",
       "      <td>-0.982404</td>\n",
       "      <td>R</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.354628</td>\n",
       "      <td>-0.133116</td>\n",
       "      <td>0.283763</td>\n",
       "      <td>-0.837063</td>\n",
       "      <td>Q</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        one       two     three      four key\n",
       "0  0.467976 -0.038649 -0.295344 -1.824726   L\n",
       "1 -0.358893  1.404453  0.704965 -0.200638   B\n",
       "2 -0.501840  0.659254 -0.421691 -0.057688   G\n",
       "3  0.204886  1.074134  1.388361 -0.982404   R\n",
       "4  0.354628 -0.133116  0.283763 -0.837063   Q"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 如果只想读取几行（避免读取整个文件），通过nrows进行指定即可：\n",
    "\n",
    "pd.read_csv('ex6.csv', nrows=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<pandas.io.parsers.TextFileReader at 0x18aae828>"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 要逐块读取文件，可以指定chunksize（行数）：\n",
    "\n",
    "chunker = pd.read_csv('ex6.csv', chunksize=1000)\n",
    "chunker"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "E    368.0\n",
       "X    364.0\n",
       "L    346.0\n",
       "O    343.0\n",
       "Q    340.0\n",
       "M    338.0\n",
       "J    337.0\n",
       "F    335.0\n",
       "K    334.0\n",
       "H    330.0\n",
       "dtype: float64"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# read_csv所返回的这个TextParser对象使你可以根据chunksize对文件进行逐块迭代。\n",
    "# 比如说，我们可以迭代处理ex6.csv，将值计数聚合到\"key\"列中，如下所示：\n",
    "\n",
    "chunker = pd.read_csv('ex6.csv', chunksize=1000)\n",
    "\n",
    "tot = pd.Series([])\n",
    "for piece in chunker:\n",
    "    tot = tot.add(piece['key'].value_counts(), fill_value=0)\n",
    "    \n",
    "tot = tot.sort_values(ascending=False)\n",
    "\n",
    "tot[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "# TextParser还有一个get_chunk方法，它使你可以读取任意大小的块。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 将数据写出到文本格式"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>something</th>\n",
       "      <th>a</th>\n",
       "      <th>b</th>\n",
       "      <th>c</th>\n",
       "      <th>d</th>\n",
       "      <th>message</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>one</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>3.0</td>\n",
       "      <td>4</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>two</td>\n",
       "      <td>5</td>\n",
       "      <td>6</td>\n",
       "      <td>NaN</td>\n",
       "      <td>8</td>\n",
       "      <td>world</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>three</td>\n",
       "      <td>9</td>\n",
       "      <td>10</td>\n",
       "      <td>11.0</td>\n",
       "      <td>12</td>\n",
       "      <td>foo</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  something  a   b     c   d message\n",
       "0       one  1   2   3.0   4     NaN\n",
       "1       two  5   6   NaN   8   world\n",
       "2     three  9  10  11.0  12     foo"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 数据也可以被输出为分隔符格式的文本。我们再来看看之前读过的一个CSV文件：\n",
    "data = pd.read_csv('ex5.csv')\n",
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      ",something,a,b,c,d,message\n",
      "0,one,1,2,3.0,4,\n",
      "1,two,5,6,,8,world\n",
      "2,three,9,10,11.0,12,foo\n"
     ]
    }
   ],
   "source": [
    "# 利用DataFrame的to_csv方法，我们可以将数据写到一个以逗号分隔的文件中：\n",
    "\n",
    "data.to_csv('out.csv')\n",
    "!type out.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "|something|a|b|c|d|message\n",
      "0|one|1|2|3.0|4|\n",
      "1|two|5|6||8|world\n",
      "2|three|9|10|11.0|12|foo\n"
     ]
    }
   ],
   "source": [
    "# 当然，还可以使用其他分隔符\n",
    "\n",
    "import sys\n",
    "data.to_csv(sys.stdout, sep='|')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      ",something,a,b,c,d,message\n",
      "0,one,1,2,3.0,4,NULL\n",
      "1,two,5,6,NULL,8,world\n",
      "2,three,9,10,11.0,12,foo\n"
     ]
    }
   ],
   "source": [
    "# 缺失值在输出结果中会被表示为空字符串。\n",
    "# 你可能希望将其表示为别的标记值：\n",
    "\n",
    "data.to_csv(sys.stdout, na_rep='NULL')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "one,1,2,3.0,4,\n",
      "two,5,6,,8,world\n",
      "three,9,10,11.0,12,foo\n"
     ]
    }
   ],
   "source": [
    "# 如果没有设置其他选项，则会写出行和列的标签。\n",
    "# 当然，它们也都可以被禁用：\n",
    "\n",
    "data.to_csv(sys.stdout, index=False, header=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "a,b,c\n",
      "1,2,3.0\n",
      "5,6,\n",
      "9,10,11.0\n"
     ]
    }
   ],
   "source": [
    "# 此外，你还可以只写出一部分的列，并以你指定的顺序排列：\n",
    "\n",
    "data.to_csv(sys.stdout, index=False, columns=['a', 'b', 'c'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2000-01-01,0\n",
      "2000-01-02,1\n",
      "2000-01-03,2\n",
      "2000-01-04,3\n",
      "2000-01-05,4\n",
      "2000-01-06,5\n",
      "2000-01-07,6\n"
     ]
    }
   ],
   "source": [
    "# Series也有一个to_csv方法：\n",
    "\n",
    "dates = pd.date_range('1/1/2000', periods=7)\n",
    "ts = pd.Series(np.arange(7), index=dates)\n",
    "ts.to_csv('tseries.csv')\n",
    "!type tseries.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 处理分隔符格式"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\"a\",\"b\",\"c\"\n",
      "\"1\",\"2\",\"3\"\n",
      "\"1\",\"2\",\"3\"\n"
     ]
    }
   ],
   "source": [
    "# 大部分存储在磁盘上的表格型数据都能用pandas.read_table进行加载。\n",
    "# 然而，有时还是需要做一些手工处理。\n",
    "# 由于接收到含有畸形行的文件而使read_table出毛病的情况并不少见。\n",
    "# 为了说明这些基本工具，看看下面这个简单的CSV文件：\n",
    "\n",
    "!type ex7.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 对于任何单字符分隔符文件，可以直接使用Python内置的csv模块。\n",
    "# 将任意已打开的文件或文件型的对象传给csv.reader：\n",
    "\n",
    "import csv\n",
    "f = open('ex7.csv')\n",
    "\n",
    "reader = csv.reader(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['a', 'b', 'c']\n",
      "['1', '2', '3']\n",
      "['1', '2', '3']\n"
     ]
    }
   ],
   "source": [
    "# 对这个reader进行迭代将会为每行产生一个元组（并移除了所有的引号）：\n",
    "\n",
    "for line in reader:\n",
    "    print(line)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 现在，为了使数据格式合乎要求，你需要对其做一些整理工作。\n",
    "# 我们一步一步来做。首先，读取文件到一个多行的列表中：\n",
    "\n",
    "with open('ex7.csv') as f:\n",
    "    lines = list(csv.reader(f))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 然后，我们将这些行分为标题行和数据行：\n",
    "header, values = lines[0], lines[1:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'a': ('1', '1'), 'b': ('2', '2'), 'c': ('3', '3')}"
      ]
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 然后，我们可以用字典构造式和zip(*values)，后者将行转置为列，创建数据列的字典：\n",
    "\n",
    "data_dict = {h: v for h, v in zip(header, zip(*values))}\n",
    "data_dict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [],
   "source": [
    "# CSV文件的形式有很多。\n",
    "# 只需定义csv.Dialect的一个子类即可定义出新格式\n",
    "# （如专门的分隔符、字符串引用约定、行结束符等）：\n",
    "\n",
    "class my_dialect(csv.Dialect):\n",
    "    lineterminator = '\\n'\n",
    "    delimiter = ';'\n",
    "    quotechar = '\"'\n",
    "    quoting = csv.QUOTE_MINIMAL\n",
    "reader = csv.reader(f, dialect=my_dialect)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 各个CSV语支的参数也可以用关键字的形式提供给csv.reader，而无需定义子类：\n",
    "\n",
    "reader = csv.reader(f, delimiter='|')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 可用的选项（csv.Dialect的属性）及其功能如表6-3所示。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 要手工输出分隔符文件，你可以使用csv.writer。\n",
    "# 它接受一个已打开且可写的文件对象以及跟csv.reader相同的那些语支和格式化选项：\n",
    "\n",
    "with open('mydata.csv', 'w') as f:\n",
    "    writer = csv.writer(f, dialect=my_dialect)\n",
    "    writer.writerow(('one', 'two', 'three'))\n",
    "    writer.writerow(('1', '2', '3'))\n",
    "    writer.writerow(('4', '5', '6'))\n",
    "    writer.writerow(('7', '8', '9'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "one;two;three\n",
      "1;2;3\n",
      "4;5;6\n",
      "7;8;9\n"
     ]
    }
   ],
   "source": [
    "!type mydata.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [],
   "source": [
    "# JSON数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [],
   "source": [
    "# JSON（JavaScript Object Notation的简称）\n",
    "# 已经成为通过HTTP请求在Web浏览器和其他应用程序之间发送数据的标准格式之一。\n",
    "# 它是一种比表格型文本格式（如CSV）灵活得多的数据格式。下面是一个例子：\n",
    "\n",
    "obj = \"\"\"\n",
    "{\"name\": \"Wes\",\n",
    " \"places_lived\": [\"United States\", \"Spain\", \"Germany\"],\n",
    " \"pet\": null,\n",
    " \"siblings\": [{\"name\": \"Scott\", \"age\": 30, \"pets\": [\"Zeus\", \"Zuko\"]},\n",
    "              {\"name\": \"Katie\", \"age\": 38,\n",
    "               \"pets\": [\"Sixes\", \"Stache\", \"Cisco\"]}]\n",
    "}\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'name': 'Wes',\n",
       " 'places_lived': ['United States', 'Spain', 'Germany'],\n",
       " 'pet': None,\n",
       " 'siblings': [{'name': 'Scott', 'age': 30, 'pets': ['Zeus', 'Zuko']},\n",
       "  {'name': 'Katie', 'age': 38, 'pets': ['Sixes', 'Stache', 'Cisco']}]}"
      ]
     },
     "execution_count": 60,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 它是构建于Python标准库中的。通过json.loads即可将JSON字符串转换成Python形式：\n",
    "\n",
    "import json\n",
    "result = json.loads(obj)\n",
    "result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'{\"name\": \"Wes\", \"places_lived\": [\"United States\", \"Spain\", \"Germany\"], \"pet\": null, \"siblings\": [{\"name\": \"Scott\", \"age\": 30, \"pets\": [\"Zeus\", \"Zuko\"]}, {\"name\": \"Katie\", \"age\": 38, \"pets\": [\"Sixes\", \"Stache\", \"Cisco\"]}]}'"
      ]
     },
     "execution_count": 62,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# json.dumps则将Python对象转换成JSON格式：\n",
    "asjson = json.dumps(result)\n",
    "asjson"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>name</th>\n",
       "      <th>age</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Scott</td>\n",
       "      <td>30</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Katie</td>\n",
       "      <td>38</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    name  age\n",
       "0  Scott   30\n",
       "1  Katie   38"
      ]
     },
     "execution_count": 63,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 如何将（一个或一组）JSON对象转换为DataFrame或其他便于分析的数据结构就由你决定了。\n",
    "# 最简单方便的方式是：向DataFrame构造器传入一个字典的列表（就是原先的JSON对象），\n",
    "# 并选取数据字段的子集：\n",
    "\n",
    "siblings = pd.DataFrame(result['siblings'], columns=['name', 'age'])\n",
    "siblings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[{\"a\": 1, \"b\": 2, \"c\": 3},\n",
      " {\"a\": 4, \"b\": 5, \"c\": 6},\n",
      " {\"a\": 7, \"b\": 8, \"c\": 9}]\n"
     ]
    }
   ],
   "source": [
    "# pandas.read_json可以自动将特别格式的JSON数据集转换为Series或DataFrame。例如：\n",
    "\n",
    "!type example.json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>a</th>\n",
       "      <th>b</th>\n",
       "      <th>c</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>7</td>\n",
       "      <td>8</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   a  b  c\n",
       "0  1  2  3\n",
       "1  4  5  6\n",
       "2  7  8  9"
      ]
     },
     "execution_count": 65,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# pandas.read_json的默认选项假设JSON数组中的每个对象是表格中的一行：\n",
    "\n",
    "data = pd.read_json('example.json')\n",
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{\"a\":{\"0\":1,\"1\":4,\"2\":7},\"b\":{\"0\":2,\"1\":5,\"2\":8},\"c\":{\"0\":3,\"1\":6,\"2\":9}}\n"
     ]
    }
   ],
   "source": [
    "# 如果你需要将数据从pandas输出到JSON，可以使用to_json方法：\n",
    "print(data.to_json())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[{\"a\":1,\"b\":2,\"c\":3},{\"a\":4,\"b\":5,\"c\":6},{\"a\":7,\"b\":8,\"c\":9}]\n"
     ]
    }
   ],
   "source": [
    "print(data.to_json(orient='records'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [],
   "source": [
    "# XML和HTML：Web信息收集"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Python有许多可以读写常见的HTML和XML格式数据的库，\n",
    "# 包括lxml、Beautiful Soup和html5lib。\n",
    "# lxml的速度比较快，但其它的库处理有误的HTML或XML文件更好。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [],
   "source": [
    "# pandas有一个内置的功能，read_html，\n",
    "# 它可以使用lxml和Beautiful Soup自动将HTML文件中的表格解析为DataFrame对象。\n",
    "# 为了进行展示，我从美国联邦存款保险公司下载了一个HTML文件（pandas文档中也使用过），\n",
    "# 它记录了银行倒闭的情况。首先，你需要安装read_html用到的库：\n",
    "\n",
    "# pip install lxml\n",
    "# pip install beautifulsoup4 html5lib"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1"
      ]
     },
     "execution_count": 71,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# pandas.read_html有一些选项，\n",
    "# 默认条件下，它会搜索、尝试解析<table>标签内的的表格数据。\n",
    "# 结果是一个列表的DataFrame对象：\n",
    "\n",
    "tables = pd.read_html('fdic_failed_bank_list.html')\n",
    "len(tables)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Bank Name</th>\n",
       "      <th>City</th>\n",
       "      <th>ST</th>\n",
       "      <th>CERT</th>\n",
       "      <th>Acquiring Institution</th>\n",
       "      <th>Closing Date</th>\n",
       "      <th>Updated Date</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Allied Bank</td>\n",
       "      <td>Mulberry</td>\n",
       "      <td>AR</td>\n",
       "      <td>91</td>\n",
       "      <td>Today's Bank</td>\n",
       "      <td>September 23, 2016</td>\n",
       "      <td>November 17, 2016</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>The Woodbury Banking Company</td>\n",
       "      <td>Woodbury</td>\n",
       "      <td>GA</td>\n",
       "      <td>11297</td>\n",
       "      <td>United Bank</td>\n",
       "      <td>August 19, 2016</td>\n",
       "      <td>November 17, 2016</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>First CornerStone Bank</td>\n",
       "      <td>King of Prussia</td>\n",
       "      <td>PA</td>\n",
       "      <td>35312</td>\n",
       "      <td>First-Citizens Bank &amp; Trust Company</td>\n",
       "      <td>May 6, 2016</td>\n",
       "      <td>September 6, 2016</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Trust Company Bank</td>\n",
       "      <td>Memphis</td>\n",
       "      <td>TN</td>\n",
       "      <td>9956</td>\n",
       "      <td>The Bank of Fayette County</td>\n",
       "      <td>April 29, 2016</td>\n",
       "      <td>September 6, 2016</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>North Milwaukee State Bank</td>\n",
       "      <td>Milwaukee</td>\n",
       "      <td>WI</td>\n",
       "      <td>20364</td>\n",
       "      <td>First-Citizens Bank &amp; Trust Company</td>\n",
       "      <td>March 11, 2016</td>\n",
       "      <td>June 16, 2016</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                      Bank Name             City  ST   CERT  \\\n",
       "0                   Allied Bank         Mulberry  AR     91   \n",
       "1  The Woodbury Banking Company         Woodbury  GA  11297   \n",
       "2        First CornerStone Bank  King of Prussia  PA  35312   \n",
       "3            Trust Company Bank          Memphis  TN   9956   \n",
       "4    North Milwaukee State Bank        Milwaukee  WI  20364   \n",
       "\n",
       "                 Acquiring Institution        Closing Date       Updated Date  \n",
       "0                         Today's Bank  September 23, 2016  November 17, 2016  \n",
       "1                          United Bank     August 19, 2016  November 17, 2016  \n",
       "2  First-Citizens Bank & Trust Company         May 6, 2016  September 6, 2016  \n",
       "3           The Bank of Fayette County      April 29, 2016  September 6, 2016  \n",
       "4  First-Citizens Bank & Trust Company      March 11, 2016      June 16, 2016  "
      ]
     },
     "execution_count": 72,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "failures = tables[0]\n",
    "failures.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2010    157\n",
       "2009    140\n",
       "2011     92\n",
       "2012     51\n",
       "2008     25\n",
       "       ... \n",
       "2004      4\n",
       "2001      4\n",
       "2007      3\n",
       "2003      3\n",
       "2000      2\n",
       "Name: Closing Date, Length: 15, dtype: int64"
      ]
     },
     "execution_count": 73,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 这里，我们可以做一些数据清洗和分析（后面章节会进一步讲解），\n",
    "# 比如计算按年份计算倒闭的银行数：\n",
    "\n",
    "close_timestamps = pd.to_datetime(failures['Closing Date'])\n",
    "close_timestamps.dt.year.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 利用lxml.objectify解析XML"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 我们先用lxml.objectify解析该文件，然后通过getroot得到该XML文件的根节点的引用：\n",
    "\n",
    "from lxml import objectify\n",
    "\n",
    "path = '../datasets/mta_perf/Performance_MNR.xml'\n",
    "parsed = objectify.parse(open(path))\n",
    "root = parsed.getroot()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [],
   "source": [
    "# root.INDICATOR返回一个用于产生各个<INDICATOR>XML元素的生成器。\n",
    "# 对于每条记录，我们可以用标记名（如YTD_ACTUAL）和数据值填充一个字典（排除几个标记）：\n",
    "\n",
    "data = []\n",
    "\n",
    "skip_fields = ['PARENT_SEQ', 'INDICATOR_SEQ',\n",
    "               'DESIRED_CHANGE', 'DECIMAL_PLACES']\n",
    "\n",
    "for elt in root.INDICATOR:\n",
    "    el_data = {}\n",
    "    for child in elt.getchildren():\n",
    "        if child.tag in skip_fields:\n",
    "            continue\n",
    "        el_data[child.tag] = child.pyval\n",
    "    data.append(el_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>AGENCY_NAME</th>\n",
       "      <th>CATEGORY</th>\n",
       "      <th>DESCRIPTION</th>\n",
       "      <th>FREQUENCY</th>\n",
       "      <th>INDICATOR_NAME</th>\n",
       "      <th>INDICATOR_UNIT</th>\n",
       "      <th>MONTHLY_ACTUAL</th>\n",
       "      <th>MONTHLY_TARGET</th>\n",
       "      <th>PERIOD_MONTH</th>\n",
       "      <th>PERIOD_YEAR</th>\n",
       "      <th>YTD_ACTUAL</th>\n",
       "      <th>YTD_TARGET</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Metro-North Railroad</td>\n",
       "      <td>Service Indicators</td>\n",
       "      <td>Percent of commuter trains that arrive at thei...</td>\n",
       "      <td>M</td>\n",
       "      <td>On-Time Performance (West of Hudson)</td>\n",
       "      <td>%</td>\n",
       "      <td>96.9</td>\n",
       "      <td>95</td>\n",
       "      <td>1</td>\n",
       "      <td>2008</td>\n",
       "      <td>96.9</td>\n",
       "      <td>95</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Metro-North Railroad</td>\n",
       "      <td>Service Indicators</td>\n",
       "      <td>Percent of commuter trains that arrive at thei...</td>\n",
       "      <td>M</td>\n",
       "      <td>On-Time Performance (West of Hudson)</td>\n",
       "      <td>%</td>\n",
       "      <td>95</td>\n",
       "      <td>95</td>\n",
       "      <td>2</td>\n",
       "      <td>2008</td>\n",
       "      <td>96</td>\n",
       "      <td>95</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Metro-North Railroad</td>\n",
       "      <td>Service Indicators</td>\n",
       "      <td>Percent of commuter trains that arrive at thei...</td>\n",
       "      <td>M</td>\n",
       "      <td>On-Time Performance (West of Hudson)</td>\n",
       "      <td>%</td>\n",
       "      <td>96.9</td>\n",
       "      <td>95</td>\n",
       "      <td>3</td>\n",
       "      <td>2008</td>\n",
       "      <td>96.3</td>\n",
       "      <td>95</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Metro-North Railroad</td>\n",
       "      <td>Service Indicators</td>\n",
       "      <td>Percent of commuter trains that arrive at thei...</td>\n",
       "      <td>M</td>\n",
       "      <td>On-Time Performance (West of Hudson)</td>\n",
       "      <td>%</td>\n",
       "      <td>98.3</td>\n",
       "      <td>95</td>\n",
       "      <td>4</td>\n",
       "      <td>2008</td>\n",
       "      <td>96.8</td>\n",
       "      <td>95</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Metro-North Railroad</td>\n",
       "      <td>Service Indicators</td>\n",
       "      <td>Percent of commuter trains that arrive at thei...</td>\n",
       "      <td>M</td>\n",
       "      <td>On-Time Performance (West of Hudson)</td>\n",
       "      <td>%</td>\n",
       "      <td>95.8</td>\n",
       "      <td>95</td>\n",
       "      <td>5</td>\n",
       "      <td>2008</td>\n",
       "      <td>96.6</td>\n",
       "      <td>95</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            AGENCY_NAME            CATEGORY  \\\n",
       "0  Metro-North Railroad  Service Indicators   \n",
       "1  Metro-North Railroad  Service Indicators   \n",
       "2  Metro-North Railroad  Service Indicators   \n",
       "3  Metro-North Railroad  Service Indicators   \n",
       "4  Metro-North Railroad  Service Indicators   \n",
       "\n",
       "                                         DESCRIPTION FREQUENCY  \\\n",
       "0  Percent of commuter trains that arrive at thei...         M   \n",
       "1  Percent of commuter trains that arrive at thei...         M   \n",
       "2  Percent of commuter trains that arrive at thei...         M   \n",
       "3  Percent of commuter trains that arrive at thei...         M   \n",
       "4  Percent of commuter trains that arrive at thei...         M   \n",
       "\n",
       "                         INDICATOR_NAME INDICATOR_UNIT MONTHLY_ACTUAL  \\\n",
       "0  On-Time Performance (West of Hudson)              %           96.9   \n",
       "1  On-Time Performance (West of Hudson)              %             95   \n",
       "2  On-Time Performance (West of Hudson)              %           96.9   \n",
       "3  On-Time Performance (West of Hudson)              %           98.3   \n",
       "4  On-Time Performance (West of Hudson)              %           95.8   \n",
       "\n",
       "  MONTHLY_TARGET  PERIOD_MONTH  PERIOD_YEAR YTD_ACTUAL YTD_TARGET  \n",
       "0             95             1         2008       96.9         95  \n",
       "1             95             2         2008         96         95  \n",
       "2             95             3         2008       96.3         95  \n",
       "3             95             4         2008       96.8         95  \n",
       "4             95             5         2008       96.6         95  "
      ]
     },
     "execution_count": 78,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 最后，将这组字典转换为一个DataFrame：\n",
    "\n",
    "perf = pd.DataFrame(data)\n",
    "perf.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {},
   "outputs": [],
   "source": [
    "# XML数据可以比本例复杂得多。\n",
    "# 每个标记都可以有元数据。\n",
    "# 看看下面这个HTML的链接标签（它也算是一段有效的XML）：\n",
    "\n",
    "from io import StringIO\n",
    "# tag = '<a href=\"http://www.google.com\">Google</a>'\n",
    "tag = '<a href=\"https://www.163.com\">网易</a>'\n",
    "root = objectify.parse(StringIO(tag)).getroot()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<Element a at 0x19d0f948>"
      ]
     },
     "execution_count": 80,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 现在就可以访问标签或链接文本中的任何字段了（如href）：\n",
    "root"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'https://www.163.com'"
      ]
     },
     "execution_count": 81,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "root.get('href')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'网易'"
      ]
     },
     "execution_count": 82,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "root.text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 6.2 二进制数据格式"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>a</th>\n",
       "      <th>b</th>\n",
       "      <th>c</th>\n",
       "      <th>d</th>\n",
       "      <th>message</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "      <td>hello</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>5</td>\n",
       "      <td>6</td>\n",
       "      <td>7</td>\n",
       "      <td>8</td>\n",
       "      <td>world</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>9</td>\n",
       "      <td>10</td>\n",
       "      <td>11</td>\n",
       "      <td>12</td>\n",
       "      <td>foo</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   a   b   c   d message\n",
       "0  1   2   3   4   hello\n",
       "1  5   6   7   8   world\n",
       "2  9  10  11  12     foo"
      ]
     },
     "execution_count": 84,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 实现数据的高效二进制格式存储最简单的办法之一是使用Python内置的pickle序列化。\n",
    "# pandas对象都有一个用于将数据以pickle格式保存到磁盘上的to_pickle方法：\n",
    "\n",
    "frame = pd.read_csv('ex1.csv')\n",
    "frame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {},
   "outputs": [],
   "source": [
    "frame.to_pickle('frame_pickle')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>a</th>\n",
       "      <th>b</th>\n",
       "      <th>c</th>\n",
       "      <th>d</th>\n",
       "      <th>message</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "      <td>hello</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>5</td>\n",
       "      <td>6</td>\n",
       "      <td>7</td>\n",
       "      <td>8</td>\n",
       "      <td>world</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>9</td>\n",
       "      <td>10</td>\n",
       "      <td>11</td>\n",
       "      <td>12</td>\n",
       "      <td>foo</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   a   b   c   d message\n",
       "0  1   2   3   4   hello\n",
       "1  5   6   7   8   world\n",
       "2  9  10  11  12     foo"
      ]
     },
     "execution_count": 86,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 你可以通过pickle直接读取被pickle化的数据，或是使用更为方便的pandas.read_pickle：\n",
    "\n",
    "pd.read_pickle('frame_pickle')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 使用HDF5格式"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## 安装\n",
    "\n",
    "# 运行下面的代码之前需要先安装 `PyTables`\n",
    "# : http://www.pytables.org/usersguide/installation.html\n",
    "# 按照上面的文档介绍，安装 `PyTables` 之前需要先安装 ``\n",
    "# 参考这个链接 : http://www.pytables.org/usersguide/installation.html#prerequisites\n",
    "# HDF5 >= 1.8.5, NumPy >= 1.8.1, Numexpr >= 2.5.2 (HDF5==1.8.15, NumPy==1.10.2, Numexpr==2.5.2)\n",
    "\n",
    "# 参考 http://docs.h5py.org/en/latest/build.html (pip install h5py)\n",
    "# 参考 https://pypi.org/project/numexpr/ (pip install numexpr)\n",
    "# 参考 http://www.pytables.org/usersguide/installation.html#installation-with-pip (pip install tables)\n",
    "\n",
    "# 在 pip 中的名字可能和项目名称不同，所以特别列出"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<class 'pandas.io.pytables.HDFStore'>\n",
       "File path: mydata.h5"
      ]
     },
     "execution_count": 90,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 虽然可以用PyTables或h5py库直接访问HDF5文件，\n",
    "# pandas提供了更为高级的接口，可以简化存储Series和DataFrame对象。\n",
    "# HDFStore类可以像字典一样，处理低级的细节：\n",
    "\n",
    "frame = pd.DataFrame({'a': np.random.randn(100)})\n",
    "store = pd.HDFStore('mydata.h5')\n",
    "store['obj1'] = frame\n",
    "store['obj1_col'] = frame['a']\n",
    "\n",
    "store"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>a</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>-0.446670</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.054714</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>-0.958658</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.036809</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.989187</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>95</th>\n",
       "      <td>-1.027086</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>96</th>\n",
       "      <td>0.195383</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>97</th>\n",
       "      <td>0.019405</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>98</th>\n",
       "      <td>-0.680558</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>99</th>\n",
       "      <td>-0.831046</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>100 rows × 1 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "           a\n",
       "0  -0.446670\n",
       "1   0.054714\n",
       "2  -0.958658\n",
       "3   0.036809\n",
       "4   0.989187\n",
       "..       ...\n",
       "95 -1.027086\n",
       "96  0.195383\n",
       "97  0.019405\n",
       "98 -0.680558\n",
       "99 -0.831046\n",
       "\n",
       "[100 rows x 1 columns]"
      ]
     },
     "execution_count": 91,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# HDF5文件中的对象可以通过与字典一样的API进行获取：\n",
    "\n",
    "store['obj1']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>a</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>-1.910411</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>-0.363381</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>-0.105853</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>1.826142</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>-0.375785</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>-0.657847</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           a\n",
       "10 -1.910411\n",
       "11 -0.363381\n",
       "12 -0.105853\n",
       "13  1.826142\n",
       "14 -0.375785\n",
       "15 -0.657847"
      ]
     },
     "execution_count": 92,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# HDFStore支持两种存储模式，'fixed'和'table'。\n",
    "# 后者通常会更慢，但是支持使用特殊语法进行查询操作：\n",
    "\n",
    "# put是store['obj2'] = frame方法的显示版本，允许我们设置其它的选项，比如格式。\n",
    "store.put('obj2', frame, format='table')\n",
    "store.select('obj2', where=['index >= 10 and index <= 15'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "metadata": {},
   "outputs": [],
   "source": [
    "store.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>a</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>-0.446670</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.054714</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>-0.958658</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.036809</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.989187</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          a\n",
       "0 -0.446670\n",
       "1  0.054714\n",
       "2 -0.958658\n",
       "3  0.036809\n",
       "4  0.989187"
      ]
     },
     "execution_count": 94,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# pandas.read_hdf函数可以快捷使用这些工具：\n",
    "\n",
    "frame.to_hdf('mydata.h5', 'obj3', format='table')\n",
    "pd.read_hdf('mydata.h5', 'obj3', where=['index < 5'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 如果你要处理的数据位于远程服务器，\n",
    "# 比如Amazon S3或HDFS，\n",
    "# 使用专门为分布式存储（比如Apache Parquet）的二进制格式也许更加合适。\n",
    "# Python的Parquet和其它存储格式还在不断的发展之中，所以这本书中没有涉及。\n",
    "\n",
    "# 如果需要本地处理海量数据，我建议你好好研究一下PyTables和h5py，\n",
    "# 看看它们能满足你的哪些需求。\n",
    "# 由于许多数据分析问题都是IO密集型（而不是CPU密集型），\n",
    "# 利用HDF5这样的工具能显著提升应用程序的效率。\n",
    "\n",
    "# 注意：HDF5不是数据库。它最适合用作“一次写多次读”的数据集。\n",
    "# 虽然数据可以在任何时候被添加到文件中，但如果同时发生多个写操作，文件就可能会被破坏。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 读取Microsoft Excel文件"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {},
   "outputs": [],
   "source": [
    "## 安装\n",
    "\n",
    "# pandas的ExcelFile类或pandas.read_excel函数\n",
    "# 支持读取存储在Excel 2003（或更高版本）中的表格型数据。\n",
    "# 这两个工具分别使用扩展包xlrd和openpyxl读取XLS和XLSX文件。\n",
    "\n",
    "# xlrd : https://pypi.org/project/xlrd/ (pip install xlrd)\n",
    "# openpyxl : https://pypi.org/project/openpyxl/ (pip install openpyxl)\n",
    "\n",
    "# 经测试，以上两个库都需要安装，pandas都需要！"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "metadata": {},
   "outputs": [],
   "source": [
    "xlsx = pd.ExcelFile('ex1.xlsx')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>a</th>\n",
       "      <th>b</th>\n",
       "      <th>c</th>\n",
       "      <th>d</th>\n",
       "      <th>message</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "      <td>hello</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>5</td>\n",
       "      <td>6</td>\n",
       "      <td>7</td>\n",
       "      <td>8</td>\n",
       "      <td>world</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>9</td>\n",
       "      <td>10</td>\n",
       "      <td>11</td>\n",
       "      <td>12</td>\n",
       "      <td>foo</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   a   b   c   d message\n",
       "0  1   2   3   4   hello\n",
       "1  5   6   7   8   world\n",
       "2  9  10  11  12     foo"
      ]
     },
     "execution_count": 100,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 存储在表单中的数据可以read_excel读取到DataFrame\n",
    "pd.read_excel(xlsx, 'Sheet1')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>a</th>\n",
       "      <th>b</th>\n",
       "      <th>c</th>\n",
       "      <th>d</th>\n",
       "      <th>message</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "      <td>hello</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>5</td>\n",
       "      <td>6</td>\n",
       "      <td>7</td>\n",
       "      <td>8</td>\n",
       "      <td>world</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>9</td>\n",
       "      <td>10</td>\n",
       "      <td>11</td>\n",
       "      <td>12</td>\n",
       "      <td>foo</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   a   b   c   d message\n",
       "0  1   2   3   4   hello\n",
       "1  5   6   7   8   world\n",
       "2  9  10  11  12     foo"
      ]
     },
     "execution_count": 101,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 如果要读取一个文件中的多个表单，\n",
    "# 创建ExcelFile会更快，但你也可以将文件名传递到pandas.read_excel：\n",
    "frame = pd.read_excel('ex1.xlsx', 'Sheet1')\n",
    "frame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 如果要将pandas数据写入为Excel格式，\n",
    "# 你必须首先创建一个ExcelWriter，然后使用pandas对象的to_excel方法将数据写入到其中：\n",
    "\n",
    "writer = pd.ExcelWriter('ex2.xlsx')\n",
    "frame.to_excel(writer, 'Sheet1')\n",
    "writer.save()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 你还可以不使用ExcelWriter，而是传递文件的路径到to_excel：\n",
    "frame.to_excel('ex2.xlsx')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 106,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 6.3 Web APIs交互"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 107,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 许多网站都有一些通过JSON或其他格式提供数据的公共API。\n",
    "# 通过Python访问这些API的办法有不少。\n",
    "# 一个简单易用的办法（推荐）是requests包（http://docs.python-requests.org）。\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# requests : https://pypi.org/project/requests/\n",
    "# pip install requests\n",
    "# 这个库我已经安装了！"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 为了搜索最新的30个GitHub上的pandas主题，\n",
    "# 我们可以发一个HTTP GET请求，使用requests扩展库："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<Response [200]>"
      ]
     },
     "execution_count": 109,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import requests\n",
    "url = 'https://api.github.com/repos/pandas-dev/pandas/issues'\n",
    "resp = requests.get(url)\n",
    "resp"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 110,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'TST: Split out test_pytables.py to sub-module of tests'"
      ]
     },
     "execution_count": 110,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 响应对象的json方法会返回一个包含被解析过的JSON字典，加载到一个Python对象中：\n",
    "\n",
    "data = resp.json()\n",
    "data[0]['title']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>number</th>\n",
       "      <th>title</th>\n",
       "      <th>labels</th>\n",
       "      <th>state</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>25004</td>\n",
       "      <td>TST: Split out test_pytables.py to sub-module ...</td>\n",
       "      <td>[]</td>\n",
       "      <td>open</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>25001</td>\n",
       "      <td>REGR: re-evaluate merge fix of  PR #24916</td>\n",
       "      <td>[{'id': 77550281, 'node_id': 'MDU6TGFiZWw3NzU1...</td>\n",
       "      <td>open</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>25000</td>\n",
       "      <td>Bug: on .to_string(index=False)</td>\n",
       "      <td>[{'id': 13101118, 'node_id': 'MDU6TGFiZWwxMzEw...</td>\n",
       "      <td>open</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>24994</td>\n",
       "      <td>0.24.0 DateTimeIndex: union() got an unexpecte...</td>\n",
       "      <td>[{'id': 49597148, 'node_id': 'MDU6TGFiZWw0OTU5...</td>\n",
       "      <td>open</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>24993</td>\n",
       "      <td>Test nested PandasArray</td>\n",
       "      <td>[]</td>\n",
       "      <td>open</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>24961</td>\n",
       "      <td>fix+test to_timedelta('NaT', box=False)</td>\n",
       "      <td>[{'id': 32815646, 'node_id': 'MDU6TGFiZWwzMjgx...</td>\n",
       "      <td>open</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>24959</td>\n",
       "      <td>Index.intersection changed behavior to sort by...</td>\n",
       "      <td>[{'id': 35818298, 'node_id': 'MDU6TGFiZWwzNTgx...</td>\n",
       "      <td>open</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>24958</td>\n",
       "      <td>CLN: isort asv_bench/benchmark/algorithms.py</td>\n",
       "      <td>[{'id': 211029535, 'node_id': 'MDU6TGFiZWwyMTE...</td>\n",
       "      <td>open</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>24957</td>\n",
       "      <td>BUG: to_timedelta('NaT', box=False) returns da...</td>\n",
       "      <td>[{'id': 32815646, 'node_id': 'MDU6TGFiZWwzMjgx...</td>\n",
       "      <td>open</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>24956</td>\n",
       "      <td>BUG: Better handle larger numbers in to_numeric</td>\n",
       "      <td>[{'id': 76811, 'node_id': 'MDU6TGFiZWw3NjgxMQ=...</td>\n",
       "      <td>open</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>30 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "    number                                              title  \\\n",
       "0    25004  TST: Split out test_pytables.py to sub-module ...   \n",
       "1    25001          REGR: re-evaluate merge fix of  PR #24916   \n",
       "2    25000                    Bug: on .to_string(index=False)   \n",
       "3    24994  0.24.0 DateTimeIndex: union() got an unexpecte...   \n",
       "4    24993                            Test nested PandasArray   \n",
       "..     ...                                                ...   \n",
       "25   24961            fix+test to_timedelta('NaT', box=False)   \n",
       "26   24959  Index.intersection changed behavior to sort by...   \n",
       "27   24958       CLN: isort asv_bench/benchmark/algorithms.py   \n",
       "28   24957  BUG: to_timedelta('NaT', box=False) returns da...   \n",
       "29   24956    BUG: Better handle larger numbers in to_numeric   \n",
       "\n",
       "                                               labels state  \n",
       "0                                                  []  open  \n",
       "1   [{'id': 77550281, 'node_id': 'MDU6TGFiZWw3NzU1...  open  \n",
       "2   [{'id': 13101118, 'node_id': 'MDU6TGFiZWwxMzEw...  open  \n",
       "3   [{'id': 49597148, 'node_id': 'MDU6TGFiZWw0OTU5...  open  \n",
       "4                                                  []  open  \n",
       "..                                                ...   ...  \n",
       "25  [{'id': 32815646, 'node_id': 'MDU6TGFiZWwzMjgx...  open  \n",
       "26  [{'id': 35818298, 'node_id': 'MDU6TGFiZWwzNTgx...  open  \n",
       "27  [{'id': 211029535, 'node_id': 'MDU6TGFiZWwyMTE...  open  \n",
       "28  [{'id': 32815646, 'node_id': 'MDU6TGFiZWwzMjgx...  open  \n",
       "29  [{'id': 76811, 'node_id': 'MDU6TGFiZWw3NjgxMQ=...  open  \n",
       "\n",
       "[30 rows x 4 columns]"
      ]
     },
     "execution_count": 111,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# data中的每个元素都是一个包含所有GitHub主题页数据（不包含评论）的字典。\n",
    "# 我们可以直接传递数据到DataFrame，并提取感兴趣的字段：\n",
    "\n",
    "issues = pd.DataFrame(data, columns=['number', 'title',\n",
    "                                     'labels', 'state'])\n",
    "issues"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 花费一些精力，你就可以创建一些更高级的常见的Web API的接口，\n",
    "# 返回DataFrame对象，方便进行分析"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 113,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 6.4 数据库交互"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 114,
   "metadata": {},
   "outputs": [],
   "source": [
    "# sqlite3 : \n",
    "# https://pypi.org/project/pysqlite/\n",
    "\n",
    "# pip install pysqlite\n",
    "\n",
    "# https://github.com/ghaering/pysqlite\n",
    "# https://pysqlite.readthedocs.io/en/latest/sqlite3.html\n",
    "\n",
    "# https://stackoverflow.com/questions/19530974/how-can-i-install-sqlite3-to-python\n",
    "\n",
    "# https://pypi.org/project/pysqlite3/\n",
    "# https://github.com/coleifer/pysqlite3\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 115,
   "metadata": {},
   "outputs": [],
   "source": [
    "## 安装\n",
    "\n",
    "# 最终我是参考 ： https://pypi.org/project/pysqlite3/\n",
    "# pip install pysqlite3\n",
    "# python -c \"import sqlite3\"\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 116,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<sqlite3.Cursor at 0x1c13b650>"
      ]
     },
     "execution_count": 116,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import sqlite3\n",
    "\n",
    "query = \"\"\"\n",
    "CREATE TABLE test\n",
    "(a VARCHAR(20), b VARCHAR(20),\n",
    " c REAL,        d INTEGER\n",
    ");\"\"\"\n",
    "\n",
    "con = sqlite3.connect('mydata.sqlite')\n",
    "\n",
    "con.execute(query)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 117,
   "metadata": {},
   "outputs": [],
   "source": [
    "con.commit()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 118,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<sqlite3.Cursor at 0x1c0d59d0>"
      ]
     },
     "execution_count": 118,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 然后插入几行数据：\n",
    "\n",
    "data = [('Atlanta', 'Georgia', 1.25, 6),\n",
    "        ('Tallahassee', 'Florida', 2.6, 3),\n",
    "        ('Sacramento', 'California', 1.7, 5)]\n",
    "\n",
    "stmt = \"INSERT INTO test VALUES(?, ?, ?, ?)\"\n",
    "\n",
    "con.executemany(stmt, data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 119,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('Atlanta', 'Georgia', 1.25, 6),\n",
       " ('Tallahassee', 'Florida', 2.6, 3),\n",
       " ('Sacramento', 'California', 1.7, 5)]"
      ]
     },
     "execution_count": 119,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 从表中选取数据时，\n",
    "# 大部分Python SQL驱动器（PyODBC、psycopg2、MySQLdb、pymssql等）都会返回一个元组列表：\n",
    "\n",
    "cursor = con.execute('select * from test')\n",
    "rows = cursor.fetchall()\n",
    "rows"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 120,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(('a', None, None, None, None, None, None),\n",
       " ('b', None, None, None, None, None, None),\n",
       " ('c', None, None, None, None, None, None),\n",
       " ('d', None, None, None, None, None, None))"
      ]
     },
     "execution_count": 120,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 你可以将这个元组列表传给DataFrame构造器，但还需要列名（位于光标的description属性中）：\n",
    "cursor.description"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 121,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>a</th>\n",
       "      <th>b</th>\n",
       "      <th>c</th>\n",
       "      <th>d</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Atlanta</td>\n",
       "      <td>Georgia</td>\n",
       "      <td>1.25</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Tallahassee</td>\n",
       "      <td>Florida</td>\n",
       "      <td>2.60</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Sacramento</td>\n",
       "      <td>California</td>\n",
       "      <td>1.70</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "             a           b     c  d\n",
       "0      Atlanta     Georgia  1.25  6\n",
       "1  Tallahassee     Florida  2.60  3\n",
       "2   Sacramento  California  1.70  5"
      ]
     },
     "execution_count": 121,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.DataFrame(rows, columns=[x[0] for x in cursor.description])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 122,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 这种数据规整操作相当多，你肯定不想每查一次数据库就重写一次。\n",
    "# SQLAlchemy项目是一个流行的Python SQL工具，它抽象出了SQL数据库中的许多常见差异。\n",
    "# pandas有一个read_sql函数，可以让你轻松的从SQLAlchemy连接读取数据。\n",
    "# 这里，我们用SQLAlchemy连接SQLite数据库，并从之前创建的表读取数据：\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 123,
   "metadata": {},
   "outputs": [],
   "source": [
    "## 安装\n",
    "\n",
    "# SQLAlchemy \n",
    "# 参考 : https://pypi.org/project/SQLAlchemy/\n",
    "# pip install SQLAlchemy\n",
    "# python -c \"import sqlalchemy\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 129,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>a</th>\n",
       "      <th>b</th>\n",
       "      <th>c</th>\n",
       "      <th>d</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Empty DataFrame\n",
       "Columns: [a, b, c, d]\n",
       "Index: []"
      ]
     },
     "execution_count": 129,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import sqlalchemy as sqla\n",
    "db = sqla.create_engine('sqlite:///mydata.sqlite')\n",
    "pd.read_sql('select * from test', db)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 6.5 总结"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
